Supervised Learning

Loading Libraries

library(dplyr)
library(cluster)
library(dplyr)
library(factoextra)
library(ggplot2)
library(gridExtra)
library(caret)
library(mclust)
library(knitr)
library(dbscan)
library(fpc)
library(FactoMineR)
library(rrcov)

Preprocessing Data

# Loading the dataset 
url <- "https://raw.githubusercontent.com/jldbc/coffee-quality-database/master/data/arabica_data_cleaned.csv"
coffee <- read.csv(url)
# Remove empty spaces and replace it with NA
coffee[coffee == ""] <- NA
coffee[coffee == " "] <- NA



# Total Cup Points, Grade and Country of Origin
coffee <- coffee[coffee$Total.Cup.Points != 0, ]
# Define the new conditions and corresponding labels
new_conditions <- c(-Inf, 79, 84.99, 90, Inf)  # Updated conditions
new_labels <- c("Commodity", "Very Good", "Excellent", "Outstanding")
# Create the 'Grade' column based on the updated conditions
coffee$Grade <- cut(coffee$Total.Cup.Points, breaks = new_conditions, labels = new_labels, right = TRUE)
coffee$Country.of.Origin <- ifelse(is.na(coffee$Country.of.Origin), "Colombia",coffee$Country.of.Origin)

# Checked the NA in Country of Origin and replaced it within the Columbia after checking the Region
coffee$Country.of.Origin <- ifelse(is.na(coffee$Country.of.Origin), "Colombia",coffee$Country.of.Origin)



# Altitude Mean Meters 
# Make a copy of the variable and keep in the column altitude_mean_meters_new
coffee$altitude_mean_meters_new = coffee$altitude_mean_meters
# Correction of error      
coffee$altitude_mean_meters_new = ifelse(coffee$altitude_mean_meters_new == 190164, 1901.64, coffee$altitude_mean_meters_new)
# Create a function to convert the instances where coffee is registered in feet to meters
feet_to_meters <- function(feet) {
  conversion_factor <- 0.3048
  meters <- feet * conversion_factor
  return(meters)
}
# Altitude that was registered by the Coffee Quality Institution and the Country of origin was Myanmar were recorded in feet
condition_1 = coffee$In.Country.Partner == "Coffee Quality Institute"
condition_2 = coffee$Country.of.Origin == "Myanmar"
rows_to_update <- c(216, 838, 1002, 1270)
coffee$altitude_mean_meters_new[rows_to_update] <- feet_to_meters(coffee$altitude_mean_meters_new[rows_to_update])
# Correction of data entry and conversion errors
coffee$altitude_mean_meters_new[c(544, 629, 1041, 1204)] <- c(1100, 1800, 1100, 1200)
# We will be storing the rows to be updated and multiply the values by 1000
rows_to_update = c(482, 280, 614, 684, 738, 762, 781,  839, 840, 878, 964)
coffee$altitude_mean_meters_new[rows_to_update] = coffee$altitude_mean_meters_new[rows_to_update] * 1000
rows_to_update_2 = c(42, 43, 786, 899)
coffee$altitude_mean_meters_new[rows_to_update_2] = coffee$altitude_mean_meters_new[rows_to_update_2] * 100



# Bag Weight and Total Weight 
# We will create a new variable called Num.Bag.Weight to convert the characters to numeric value.
coffee$Num.Bag.Weight = coffee$Bag.Weight

#Treating the Bag.Weight variable by converting its lbs value to kg and then to numeric
# Identify values in pounds
is_lbs <- grepl(" lbs", coffee$Num.Bag.Weight)
# Identify values in kg
is_kg <-grepl(" kg", coffee$Num.Bag.Weight)
# Remove " lbs" from variable
coffee$Num.Bag.Weight[is_lbs] <- gsub(" lbs", "", coffee$Num.Bag.Weight[is_lbs])
# Remove " kg" from variable
coffee$Num.Bag.Weight[is_kg] <- gsub(" kg", "", coffee$Num.Bag.Weight[is_kg])
# Convert pound values to kg and store them back as numeric value
coffee$Num.Bag.Weight[is_lbs] <- as.numeric(coffee$Num.Bag.Weight[is_lbs]) * 0.45359237
# Convert kg values to numeric
coffee$Num.Bag.Weight[is_kg] <- as.numeric(coffee$Num.Bag.Weight[is_kg])
# Round the values to decimal places
coffee$Num.Bag.Weight <- round(as.numeric(coffee$Num.Bag.Weight), 2)
coffee$Total.Weight = (coffee$Num.Bag.Weight * coffee$Number.of.Bags)


# Log(x + 1) tranfromation to values that are skewed data with 0 values
coffee <- coffee %>%
  mutate(
    Log.Total.Weight = log(Total.Weight + 1),
    Log.Category.One.Defects = log(Category.One.Defects + 1),
    Log.Category.Two.Defects = log(Category.Two.Defects + 1))


# Reordering, Missing Categories and Character to Factor
coffee_clean <- dplyr::select(coffee, Grade, Total.Cup.Points, Aroma, Flavor, Aftertaste, 
                              Acidity, Body, Balance, Uniformity, Clean.Cup,
                             Sweetness, Cupper.Points, Log.Category.One.Defects, Log.Category.Two.Defects, 
                            Quakers, Moisture, altitude_mean_meters_new, Log.Total.Weight, 
                        Variety, Color, Processing.Method, Country.of.Origin, In.Country.Partner)

# Replace the missing categorical NAs with "Unknown"
coffee_clean <- coffee_clean %>%
  mutate(
    Color = coalesce(Color, "Unknown"),
    Variety = coalesce(Variety, "Unknown"),
    Processing.Method = coalesce(Processing.Method, "Unknown"),
    
  )

# Convert variables stored as character into factors 
convert_characters_to_factors <- function(data) {
  # Identify character columns
  char_cols <- sapply(data, is.character)
  
  # Convert character columns to factors
  data[char_cols] <- lapply(data[char_cols], as.factor)
  
  # Return the modified data frame
  return(data)
}



coffee_clean <- convert_characters_to_factors(coffee_clean)
coffee_complete =  coffee_clean[complete.cases(coffee_clean), ]

These are the questions we would like to answer:

  1. Can we find characteristics that help us identify sensory and non-sensory characteristics that differentiate the quality of coffee beans?
  2. Are there natural groupings based on sensory and non-sensory features?
  3. Can we find distinct clusters based on the sensory and the non-sensory features for coffee bean grade?

We would like to find patterns that we may discover and that may not be apparent at a first glance. Hopefully identifying clusters will help us gain an understanding of characteristics of the different types of coffee. Let us start by checking our data target variable.

# Calculate the percentage of each grade
grade_percentages <- prop.table(table(coffee_complete$Grade)) * 100

# Create a data frame with grade levels and percentages
grade_data <- data.frame(Grade = names(grade_percentages),
                         Percentage = as.numeric(grade_percentages))

# Reorder grade levels based on percentage
grade_data$Grades <- factor(grade_data$Grade, levels = grade_data$Grade[order(-grade_data$Percentage)])

# Create the plot using ggplot2
barplot_grades <- ggplot(grade_data, aes(x = Grades, y = Percentage, fill = Grades)) +
  geom_bar(stat = "identity", alpha = 0.5, fill = "orange") +
  labs(title = "Distribution of Coffee Grades",
       x = "Grade", y = "Percentage") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

# Display the plot
print(barplot_grades)

table(coffee_complete$Grade)
## 
##   Commodity   Very Good   Excellent Outstanding 
##          82         920          77           1

Functions Used for Clustering

compute_cluster_stats <- function(data,num_dimensions,  distance = "euclidean") {
  # Compute the dissimilarity matrix
  if (distance == "gower") {
    diss <- daisy(data$ind$coord[, 1:num_dimensions], metric = distance)
  } else {
    diss <- dist(data$ind$coord[, 1:num_dimensions], method = distance)
  }
  
  # Elbow plot
  elbow <- fviz_nbclust(data$ind$coord[, 1:num_dimensions], pam, method = "wss")
  
  # Silhouette plot
  silhouette <- fviz_nbclust(data$ind$coord[, 1:num_dimensions], pam, method = "silhouette") + theme_minimal()
  
  # Calculate gap statistic
  gap_stat_calc <- clusGap(data$ind$coord[, 1:num_dimensions], FUN = pam, K.max = 10, B = 50)
  gap_stat <- fviz_gap_stat(gap_stat_calc)
  
  # Arrange the plots in a grid
  plot <- grid.arrange(elbow, silhouette, gap_stat, nrow = 2, ncol = 2)
  
  # Return the plot
  return(plot)
}


perform_pam_and_plot <- function(data, true_labels, centers, num_dimensions, distance = "euclidean") {
  # Initialize a list to store the plots
  plots <- list()
  
  # Create a data frame for the true labels
  df_true <- data.frame(data$ind$coord[, 1:num_dimensions], True = true_labels)
  
  # Create the plot for true labels
  plot_true <- ggplot(df_true, aes(x = `Dim.1`, y = `Dim.2`, color = True)) +
    geom_point(alpha = 0.6, size = 3) +
    scale_color_brewer(palette = "Set1") +
    theme_minimal() +
    labs(
      title = "Ground Truth",
      x = "Principal Component 1",
      y = "Principal Component 2",
      color = "True Label"
    ) +
    theme(
      plot.title = element_text(hjust = 0.5),
      legend.position = "bottom"
    )
  
  # Add the true labels plot to the list
  plots <- list(plot_true)
  
  # Compute the dissimilarity matrix
  if (distance == "gower") {
    diss <- daisy(data$ind$coord[, 1:num_dimensions], metric = distance)
  } else {
    diss <- dist(data$ind$coord[, 1:num_dimensions], method = distance)
  }
  
  # Loop over the centers
  for(i in seq_along(centers)) {
    # Perform PAM clustering
    pam_result <- pam(diss, k = centers[i])
    
    # Create a data frame for the predicted clusters
    df_pam <- data.frame(data$ind$coord[, 1:num_dimensions], Predicted = as.factor(pam_result$clustering))
    
    # Create the plot for predicted clusters
    plot_predicted <- ggplot(df_pam, aes(x = `Dim.1`, y = `Dim.2`, color = Predicted)) +
      geom_point(alpha = 0.6, size = 3) +
      scale_color_brewer(palette = "Set1") +
      theme_minimal() +
      labs(
        title = paste("Predicted Clusters (k =", centers[i], ")"),
        x = "Principal Component 1",
        y = "Principal Component 2",
        color = "Predicted Cluster"
      ) +
      theme(
        plot.title = element_text(hjust = 0.5),
        legend.position = "bottom"
      )
    
    # Add the predicted clusters plot to the list
    plots <- c(plots, list(plot_predicted))
  }
  
  # Calculate the number of rows and columns for the grid
  ncol <- 3
  nrow <- ceiling(length(plots) / ncol)
  
  # Arrange the plots in a grid
  do.call(grid.arrange, c(plots, ncol = ncol, nrow = nrow))
}


compute_silhouette <- function(data, centers, num_dimensions, distance = "euclidean") {
  # Initialize a list to store the plots
  plots <- list()
  
  # Compute the dissimilarity matrix
  if (distance == "gower") {
    diss <- daisy(data$ind$coord[, 1:num_dimensions], metric = distance)
  } else {
    diss <- dist(data$ind$coord[, 1:num_dimensions], method = distance)
  }
  
  # Loop over the centers
  for(i in seq_along(centers)) {
    # Perform PAM clustering
    pam_result <- pam(diss, k = centers[i])
    
    # Check if there are clusters with only one observation
    if(any(table(pam_result$clustering) == 1)) {
      next
    }
    
    # Compute silhouette information
    silhouette_info <- silhouette(pam_result$clustering, diss)
    
    # Check if silhouette computation was successful
    if(is.logical(silhouette_info)) {
      next
    }
    
    # Create silhouette plot
    silhouette_plot <- fviz_silhouette(silhouette_info, palette = "jco", ggtheme = theme_classic())
    
    # Add the silhouette plot to the list
    plots[[i]] <- silhouette_plot
  }
  
do.call(grid.arrange, c(plots, nrow = 2))
}

compute_rand_indices <- function(data, true_labels, centers, num_dimensions, distance = "euclidean") {
  # Initialize a data frame to store the results
  results <- data.frame()
  
  # Compute the dissimilarity matrix
  if (distance == "gower") {
    diss <- daisy(data$ind$coord[, 1:num_dimensions], metric = distance)
  } else {
    diss <- dist(data$ind$coord[, 1:num_dimensions], method = distance)
  }
  
  # Loop over the centers
  for(i in seq_along(centers)) {
    # Perform PAM clustering
    pam_result <- pam(diss, k = centers[i])
    
    # Compute the Rand Index
    rand_index <- adjustedRandIndex(true_labels, pam_result$clustering)
    
    # Add the result to the data frame
    results <- rbind(results, data.frame(NumClusters = centers[i], RandIndex = rand_index))
  }
  
  # Return the results
  return(results)
}

Sensory Data Clustering

In this section we will select sensory data from the coffee_complete data base that we have prepared in the pre-processing stage. The sensory data chosen to work on for the unsupervised learning are scores that are given by the Q-graders to determine the quality of coffee. So we subset the features Aroma, Flavor, Acidity, Body, Balance and Cupper.Points to determine if these subjective parameters will enable us to identify distinct groups. In this instance we have decided to drop Uniformity, Sweetness and Clean.Cup because the values are uniform and show no varability. It is especially important to remove these features because to conduct PCA features that do not provide variability and do not contribute to the variance will not add any useful information to our anlaysis

Prepare Data with sensory Features

# Select the sensory variables
sensory_data <- coffee_complete[, c("Aroma", "Flavor", "Aftertaste", "Acidity", "Body", "Balance")]

Principal Component Analayis (PCA)

In order to begin the analysis for the sensory data we start by conducting a Principal Component Analysis (PCA) so that we can identify the main sensory features that differentiate coffee beans. This will enable us to focus extract new features from the data that are a linear combination of the original variables. Since the senory attributes are highly correlated with each other this will be great step in order to untangle them and capture them and focusing on the data with the largerst variance.

# Perform PCA
pca_sensory <- PCA(sensory_data, scale.unit=TRUE, ncp=5, graph=FALSE)
summary(pca_sensory)
## 
## Call:
## PCA(X = sensory_data, scale.unit = TRUE, ncp = 5, graph = FALSE) 
## 
## 
## Eigenvalues
##                        Dim.1   Dim.2   Dim.3   Dim.4   Dim.5   Dim.6
## Variance               4.464   0.465   0.379   0.313   0.242   0.138
## % of var.             74.394   7.747   6.311   5.222   4.029   2.297
## Cumulative % of var.  74.394  82.140  88.452  93.674  97.703 100.000
## 
## Individuals (the 10 first)
##                Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## 1          |  8.772 |  8.707  1.573  0.985 |  0.425  0.036  0.002 | -0.526
## 2          |  8.117 |  8.052  1.345  0.984 |  0.649  0.084  0.006 |  0.000
## 3          |  6.996 |  6.987  1.013  0.997 | -0.028  0.000  0.000 | -0.041
## 4          |  6.949 |  6.822  0.965  0.964 | -0.740  0.109  0.011 | -0.421
## 5          |  6.759 |  6.682  0.926  0.977 | -0.503  0.050  0.006 | -0.514
## 8          |  6.797 |  6.746  0.944  0.985 | -0.580  0.067  0.007 | -0.147
## 9          |  7.772 |  7.726  1.238  0.988 |  0.633  0.080  0.007 |  0.268
## 10         |  6.340 |  5.947  0.734  0.880 |  0.690  0.095  0.012 | -1.170
## 11         |  5.997 |  5.681  0.669  0.897 |  1.012  0.204  0.028 | -1.258
## 12         |  5.627 |  5.596  0.650  0.989 |  0.309  0.019  0.003 | -0.418
##               ctr   cos2  
## 1           0.068  0.004 |
## 2           0.000  0.000 |
## 3           0.000  0.000 |
## 4           0.043  0.004 |
## 5           0.065  0.006 |
## 8           0.005  0.000 |
## 9           0.018  0.001 |
## 10          0.335  0.034 |
## 11          0.387  0.044 |
## 12          0.043  0.006 |
## 
## Variables
##               Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3    ctr   cos2
## Aroma      |  0.811 14.732  0.658 |  0.473 48.191  0.224 |  0.253 16.884  0.064
## Flavor     |  0.927 19.238  0.859 |  0.123  3.233  0.015 | -0.052  0.701  0.003
## Aftertaste |  0.911 18.579  0.829 |  0.009  0.017  0.000 |  0.024  0.154  0.001
## Acidity    |  0.837 15.701  0.701 |  0.044  0.409  0.002 | -0.515 70.014  0.265
## Body       |  0.824 15.206  0.679 | -0.408 35.770  0.166 |  0.117  3.626  0.014
## Balance    |  0.859 16.543  0.738 | -0.240 12.379  0.058 |  0.181  8.620  0.033
##             
## Aroma      |
## Flavor     |
## Aftertaste |
## Acidity    |
## Body       |
## Balance    |

The first principal component explains approximately 74.39% of the total variance, while the second principal component explains 7.74% of the variance. Cumulative proportion of variance that is explained by the first two principal component is around 82.1% of the total variance in the data.This shows that the data can largely be summarized using the first two component.

scree_sensory <- fviz_eig(pca_sensory, addlabels = TRUE, ylim = c(0, 80), main = "Scree Plot Sensory Data", fill = "orange")
contribution_sensory <- fviz_cos2(pca_sensory, choice = "var", axes = 1:2)
var_contrbution_sensory <- fviz_pca_var(pca_sensory, repel = TRUE, 
                col.var = "contrib", # Color by contributions to the PC
                gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07") )
habillage_var <- fviz_pca_biplot(pca_sensory, label= "var", habillage = coffee_complete$Grade)
grid.arrange(scree_sensory, contribution_sensory, var_contrbution_sensory, habillage_var, ncol = 2, nrow = 2)

In order to visualize the the result from the principal component anlaysis on the sensory features we will be using various plots. The first one we have utilized is the scree plot which plots the variance explained by the principal components. We can see in that the first principal component explains around 74% of the variance and the second principal component explains 7.7% of the variance. This will enable us to decide on the amount of dimensions from the principal component we should utilize for further analysis.

The second plot shows the representation of each of the variables. We have used the first two dimensions to determine how each of the variables are represented. The higher level of the cosine squared the better the variable is represented. Hence we can see from the plot that Flavor and Aroma are well-represented, followed by Aftertaste.

The third plot we have utilized is the variable contribution plot which shows the contribution of each plot to the dimensions of the principal components. From the plot we can see that Aroma and Flavor contribute significantly to the first principal component where as the Aroma and Body significantly to the second principal component.

Lastly the biplot allows us to combine the information of the individual data points and the variables. The projects of the individual data points and the sensory variables onto the principal components. We can see from the plot that coffee that have are similar will be close together on the plot and variables that are correlated will point to the same direction.

From the biplot something we can take away is that the Aroma, Flavor, Aftertaste and Acidity are close and pointing to the same direction suggesting that tend to increase and decrease together. We can see from the individual points that the coffee that are graded commodity (red points) have lower values for most sensory attributes. On the other hand coffee graded outstanding (purple point) tens to have higher value indicating that these having a high score of these sensory attribute increases the potential grade of the coffee. The Body attribute is less correlated with the other sensory attribute and this could be influenced by other factors such as the brewing methods which is not captured.

K-Medoids

Before clustering the data points on we will be assessing the optimal number of clusters that we need using the three common techniques. We are using the PCA of the data in order to determine the optimal number of data clusters.

sensory_plot_clusters <- compute_cluster_stats(pca_sensory,2, distance = "euclidean")

We have used the “elbow method” which utilizes the within cluster sum of squares and evaluates where it decreases at a slower rate in order to determine the optimal number of clusters. We can see that this occurs at 3 or 4 suggesting that could be the best number of cluster.

Secondly we have utilized the silhouette method which measures how similar each data point is within it own cluster compared to other clusters. And based on the vertical line we can see that best number of cluster is 2 according to the silhouette method.

Lastly we have utilized the gap statistics which compares the variation intra-clusters for different values of k with expected values under null reference distribution of data. The gap statistic suggests that the best number of cluster would be to have no clusters. But usually since the higher gap statistics suggest the best value for clustering we will consider 4.

pam_sensory <- perform_pam_and_plot(pca_sensory, coffee_complete$Grade, centers = c(2, 3, 4, 5, 6), 2, distance = "euclidean")

The plot below shows the results of PAM clustering on the two principal component dimensions data with different levels of clustering that we have obtained from the elbow, silhouette and the gap statistics methods. We have used the ground truth as a way to compare the results that we get from the remaining clusters.

With k = 2 we can see that the data points are divided into two clusters. When compared to the ground truth we can see that finer group distinction are not captured. With k = 3, the separation somewhat resembles the ground truth but far from perfect. As we increase we can seem to be replicating the true label. We can see that this is occurring by the Very Good graded coffee from the true label being split into further clusters and the Commodity and Excellent coffee being represented better in when the number of clusters reaches six.

# Use the function
silhouette_plots_sensory <- compute_silhouette(pca_sensory, centers = c(2, 3, 4, 5), 2, distance = "euclidean")
##   cluster size ave.sil.width
## 1       1  642          0.46
## 2       2  438          0.45
##   cluster size ave.sil.width
## 1       1  330          0.28
## 2       2  489          0.48
## 3       3  261          0.38
##   cluster size ave.sil.width
## 1       1  221          0.19
## 2       2  341          0.41
## 3       3  309          0.38
## 4       4  209          0.31
##   cluster size ave.sil.width
## 1       1   73          0.25
## 2       2  193          0.36
## 3       3  328          0.33
## 4       4  290          0.36
## 5       5  196          0.31

The plots above are a way to show the similarity of each data point within its cluster compared to other data points in other clusters. The higher the average the silhouette width indictaed that the points are well clustered. When our number of cluster is k = 2, the average silhouette is 0.44, which indicates correct clustering. By increasing the number of clusters to k = 3, we can see that the average silhouette width decreases to 0.38, with some data points in cluster 1 and 3 belonging to the adjacent cluster. However all three clusters are positive and they indicate uniqueness. As we increase the number of clusters we can see that the average silhouette drops to 0.34 with all clusters having data points that belong in the adjacent clusters. Thus it would be wise to consider the number of clusters that would have two to three clusters because theu offer a reasonable of distinct groups while also maintaining higher avereage silhouette width.

rand_indices_sensory <- compute_rand_indices(pca_sensory, coffee_complete$Grade, centers = 2:7, 2, distance = "euclidean")

rand_indices_sensory
##   NumClusters  RandIndex
## 1           2 0.03961925
## 2           3 0.10958356
## 3           4 0.09082945
## 4           5 0.11950625
## 5           6 0.08053726
## 6           7 0.09360433

Now as an external validation we will be using the Rand Index which measures the similarity between two data clustering by using the true labels ranging from 0 to 1, with 1 indicating that the cluster being identical to the true labels and 0 indicates that clustering are dissimilar. We can see from the table that when k = 5 the Rand Index is at its highest and matches the plot of the clustered plots. With other values of clusters we can see that index is quite low when compared to k = 5. This suggests that having five cluster is the most similar to the ground truth but we have to also acknowledge that the index is still relatively low.

Non-Sensory Data Clustering

Now in the next stage we are moving into analysing the non-sensory data, which is comprised of altitude_mean_meters, Log.Category.One.Defects, Log.Category.Two.Defects, Quakers, Moisture, Log.Total.Weight, Country.Of.Origin, Processing.Methods, Variety, Color and In.Country.Partners. In this section we have both continuous and categorical variable thus we will be applying FAMD, which is a mix between PCA and multiple correspondence analysis. By now it is easy to understand that we are not using the dimensional reduction as a common technique used to cluster while retaining as much information about the original data.

# Set up the non_sensory_data 
non_sensory_data <- coffee_complete[, c("altitude_mean_meters_new", "Log.Category.One.Defects", "Log.Category.Two.Defects", "Quakers", "Moisture", "Log.Total.Weight", "Country.of.Origin", "Processing.Method", "Variety", "Color","In.Country.Partner")]

Factor Analaysis on Mixed Data on Non-Senory

# Conduct a Factory Analysis on Miced Data on the Non-Sensory Data 
famd_non_sensory <- FAMD(non_sensory_data, graph = FALSE, ncp = 60)

# Print the data for interpretation
summary(famd_non_sensory)
## 
## Call:
## FAMD(base = non_sensory_data, ncp = 60, graph = FALSE) 
## 
## 
## Eigenvalues
##                       Dim.1  Dim.2  Dim.3  Dim.4  Dim.5  Dim.6  Dim.7  Dim.8
## Variance              3.579  3.113  2.970  2.835  2.727  2.642  2.474  2.411
## % of var.             3.509  3.052  2.911  2.780  2.674  2.590  2.426  2.364
## Cumulative % of var.  3.509  6.560  9.472 12.251 14.925 17.515 19.940 22.305
##                       Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14 Dim.15 Dim.16
## Variance              2.363  2.316  2.192  2.136  1.961  1.904  1.825  1.765
## % of var.             2.317  2.271  2.149  2.094  1.922  1.866  1.790  1.731
## Cumulative % of var. 24.621 26.892 29.041 31.135 33.057 34.923 36.713 38.443
##                      Dim.17 Dim.18 Dim.19 Dim.20 Dim.21 Dim.22 Dim.23 Dim.24
## Variance              1.673  1.644  1.623  1.559  1.458  1.387  1.363  1.320
## % of var.             1.640  1.612  1.591  1.529  1.430  1.360  1.336  1.294
## Cumulative % of var. 40.084 41.696 43.287 44.816 46.246 47.605 48.941 50.235
##                      Dim.25 Dim.26 Dim.27 Dim.28 Dim.29 Dim.30 Dim.31 Dim.32
## Variance              1.309  1.283  1.243  1.229  1.198  1.147  1.146  1.114
## % of var.             1.283  1.258  1.218  1.205  1.174  1.125  1.123  1.092
## Cumulative % of var. 51.518 52.776 53.994 55.199 56.373 57.498 58.621 59.713
##                      Dim.33 Dim.34 Dim.35 Dim.36 Dim.37 Dim.38 Dim.39 Dim.40
## Variance              1.094  1.076  1.055  1.046  1.035  1.029  1.025  1.021
## % of var.             1.072  1.055  1.035  1.025  1.015  1.009  1.005  1.001
## Cumulative % of var. 60.785 61.840 62.875 63.900 64.915 65.924 66.930 67.931
##                      Dim.41 Dim.42 Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48
## Variance              1.015  1.010  1.008  1.006  1.001  1.001  1.001  1.000
## % of var.             0.995  0.991  0.988  0.986  0.982  0.981  0.981  0.981
## Cumulative % of var. 68.926 69.916 70.904 71.890 72.872 73.854 74.835 75.815
##                      Dim.49 Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55 Dim.56
## Variance              0.995  0.987  0.978  0.963  0.939  0.932  0.903  0.880
## % of var.             0.975  0.968  0.959  0.945  0.920  0.913  0.885  0.863
## Cumulative % of var. 76.790 77.758 78.717 79.662 80.582 81.495 82.381 83.244
##                      Dim.57 Dim.58 Dim.59 Dim.60
## Variance              0.832  0.816  0.808  0.800
## % of var.             0.815  0.800  0.792  0.785
## Cumulative % of var. 84.059 84.859 85.651 86.436
## 
## Individuals (the 10 first)
##                              Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2
## 1                        | 11.637 |  2.719  0.191  0.055 |  1.159  0.040  0.010
## 2                        | 11.516 |  1.960  0.099  0.029 |  0.009  0.000  0.000
## 3                        |  6.708 |  1.327  0.046  0.039 |  2.000  0.119  0.089
## 4                        | 11.734 |  2.671  0.185  0.052 |  1.450  0.063  0.015
## 5                        | 11.506 |  1.885  0.092  0.027 | -0.041  0.000  0.000
## 8                        | 14.338 |  4.592  0.545  0.103 |  5.091  0.771  0.126
## 9                        | 14.338 |  4.592  0.545  0.103 |  5.091  0.771  0.126
## 10                       | 11.591 |  1.686  0.074  0.021 |  0.526  0.008  0.002
## 11                       | 11.980 |  3.125  0.253  0.068 |  2.499  0.186  0.044
## 12                       | 12.932 |  1.901  0.094  0.022 |  1.744  0.090  0.018
##                             Dim.3    ctr   cos2  
## 1                        | -1.880  0.110  0.026 |
## 2                        | -1.819  0.103  0.025 |
## 3                        | -0.618  0.012  0.008 |
## 4                        | -1.187  0.044  0.010 |
## 5                        | -1.838  0.105  0.026 |
## 8                        | -3.050  0.290  0.045 |
## 9                        | -3.050  0.290  0.045 |
## 10                       | -1.110  0.038  0.009 |
## 11                       | -1.648  0.085  0.019 |
## 12                       | -2.625  0.215  0.041 |
## 
## Continuous variables
##                             Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## altitude_mean_meters_new |  0.405  4.592  0.164 | -0.303  2.958  0.092 | -0.386
## Log.Category.One.Defects | -0.222  1.377  0.049 | -0.004  0.001  0.000 | -0.137
## Log.Category.Two.Defects | -0.307  2.633  0.094 | -0.193  1.198  0.037 | -0.068
## Quakers                  |  0.066  0.121  0.004 | -0.127  0.519  0.016 |  0.210
## Moisture                 | -0.434  5.272  0.189 | -0.350  3.927  0.122 | -0.037
## Log.Total.Weight         |  0.661 12.210  0.437 | -0.245  1.935  0.060 |  0.244
##                             ctr   cos2  
## altitude_mean_meters_new  5.011  0.149 |
## Log.Category.One.Defects  0.628  0.019 |
## Log.Category.Two.Defects  0.158  0.005 |
## Quakers                   1.479  0.044 |
## Moisture                  0.046  0.001 |
## Log.Total.Weight          2.007  0.060 |
## 
## Categories (the 10 first)
##                              Dim.1     ctr    cos2  v.test     Dim.2     ctr
## Brazil                   |   0.139   0.015   0.001   0.793 |   1.283   1.651
## Burundi                  |   2.590   0.097   0.012   1.937 |   1.400   0.037
## China                    |  -0.649   0.049   0.003  -1.381 |  -0.187   0.005
## Colombia                 |   2.497   6.718   0.397  17.348 |   0.440   0.275
## Costa Rica               |   0.984   0.322   0.025   3.604 |  -0.370   0.060
## Cote d?Ivoire            |  -1.324   0.013   0.002  -0.700 |   2.683   0.069
## Ecuador                  |   0.119   0.000   0.000   0.063 |   3.310   0.105
## El Salvador              |   1.068   0.148   0.011   2.415 |   0.472   0.038
## Ethiopia                 |   2.592   1.457   0.092   7.606 |   2.170   1.351
## Guatemala                |   0.341   0.131   0.009   2.436 |  -1.792   4.786
##                             cos2  v.test     Dim.3     ctr    cos2  v.test  
## Brazil                     0.070   7.837 |   4.143  18.925   0.727  25.917 |
## Burundi                    0.003   1.123 |  -0.935   0.018   0.002  -0.768 |
## China                      0.000  -0.428 |   0.804   0.109   0.004   1.880 |
## Colombia                   0.012   3.274 |  -1.168   2.134   0.087  -8.907 |
## Costa Rica                 0.003  -1.452 |   0.000   0.000   0.000  -0.001 |
## Cote d?Ivoire              0.006   1.521 |  -0.418   0.002   0.000  -0.242 |
## Ecuador                    0.010   1.876 |   0.456   0.002   0.000   0.265 |
## El Salvador                0.002   1.144 |   0.445   0.037   0.002   1.105 |
## Ethiopia                   0.065   6.831 |  -1.802   1.022   0.045  -5.805 |
## Guatemala                  0.238 -13.707 |   0.588   0.566   0.026   4.604 |

In this section we have implemented the factory analysis of mixed data in order to incorporate categorical and numerical attributes within from the data. The percentage of the variance that is explained by the first principal component is 3.50% and the variance explained by the second principal component is 3.05% with a cumulative percentage variance totaling around 6.56%.

In this instance we have included the principal components that explain 80% of the variance. In this case we will be considering 53 of the principal components. In order to extract more meaning from the plots let us try to plot different ways of interpreting the results. Also it is important to note that the altitude_mean_meters and Log.Total.Weight are well represented when compared to the rest of the variables.

scree_non_sensory <- fviz_eig(famd_non_sensory, addlabels = TRUE, ylim = c(0, 5), ncp = 45, main = "Scree Plot Non-Sensory Data") + geom_hline(yintercept = 1, color = "red", linetype = "dashed", size = 1)
contribution_non_sensory <- fviz_cos2(famd_non_sensory, choice = "var", axes = 1:36)
var_contribution_non_sensory <- var_contribution_non_sensory <- fviz_mca_var(famd_non_sensory, repel = TRUE, 
                col.var = "contrib",
                ncp = 36,
                gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

numericals_non_sensory <- fviz_famd_var(famd_non_sensory, "quanti.var", col.var = "contrib", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

habillage_var_famd <- fviz_mca_biplot(famd_non_sensory, 
                                      label = "var",
                                      ncp = 36, 
                                      habillage = coffee_complete$Grade, 
                                      select.var = list(contrib =6), # Select top 5 contributing variables
                                      labelsize = 4, # Increase the size of the labels
                                      pointsize = 2)
grid.arrange(scree_non_sensory, contribution_non_sensory, var_contribution_non_sensory, habillage_var_famd)

The first plot shows us the scree plot for the non-sensory data and we saw that since we can only explain a small variation of the data we have included many dimensions in order to find dimension where the decrease in eigenvalues become less pronounced. In addition we have included the Kaiser criterion which a rule of thumb that is commonly used to retain the component with eigenvalues greater than 1. The logic is that any variable on it own would have an eigen value of 1 so any component should explain the variance of more than 1 variable to be considered significant. We can see that the cutoff dimension using this rule of thumb is 35 principal components.

In the second plot we have the representation of the variables over the 53 dimensions that would give us a cumulative of 80% variance. We can see that Log.Total.Weight and altitude_mean_meters are well represented in the first principal components. From the plot we can see that In.Country.Partners and Country.of.Origin have the largest contributions indicating these two factors have a great influence in the variability of the data. Then followed by Processing.Method and Color also have a notable contribution to the variability in the data, whereas the altitude_mean_meters and both of the defects, Moisture and Quakers have lower contributions.

From the third plot we can see that the variables having the lowest on the lower left Quakers, Log.Category.One.Defect and Log.Category.Two.Defects have low contribution but are somewhat correlated. Also we can see that altitude_new_meters and Moisture might be correlated and also show a somewhat higher level of contribution when compared to the defects. Log.Total.Weight has alos a higher contribution when compared to the defects but it also is not correlated with any other variables. Similarly, the contribution of Processing.Methods and Color are correlated and have similar contribution as Log.Total.Weight.

When evaluating In.Country.Partners and Country.of.Origin we can see that these two attributes are related and are positioned on the top right corner of the plot indicating that they have a positive correlation. The fact that they are related makes sense because the Country.of.Origin will either have a local In.Country.Partner usually. Variety on the other is isolated and the characteristics are not shared by others.

When we compare the last biplot from the non-sensory to the sensory we can see that the data points representing the Commodity and Excellent grade are scattered with no clear clusters. Where as the grade Very Good is clustered almost everywhere but we cannot get extract any insight from the position of the variables and the data points. So we will be conducting a k-medoid clustering.

K-Medoids Clustering Non-Sensory

cluster_stats_non_sensory <- compute_cluster_stats(famd_non_sensory,2, distance = "gower")

The common approach for the “elbow method” is look for an elbow which usually signifies where the within sum of squares slows down significantly after 4 clusters but what seems like an “elbow” is around 5 number of clusters. When evaluating the silhouette methods the score peaks at 5 number of clusters. Where as the gap statistics shows the highest to be around 6 clusters. So in the next stage we will be using different values of clusters to plot how they are clustered using k-medoids.

clusters_non_sensoy <- perform_pam_and_plot(famd_non_sensory, coffee_complete$Grade, centers = c(3, 4, 5, 6, 7), 2, distance = "gower")

Performing the PAM clustering shows that our data that consists of two principal components from the factor analysis of mixed data. The plot shows that the k = 5 seems to be a good balance for the data set.

silhouette_plots_non_sensory <- compute_silhouette(famd_non_sensory, centers = c(2, 3, 5, 6, 7), 2, distance = "gower")
##   cluster size ave.sil.width
## 1       1  719          0.36
## 2       2  361          0.55
##   cluster size ave.sil.width
## 1       1  340          0.26
## 2       2  358          0.56
## 3       3  382          0.49
##   cluster size ave.sil.width
## 1       1   88          0.31
## 2       2  194          0.51
## 3       3  276          0.34
## 4       4  289          0.56
## 5       5  233          0.55
##   cluster size ave.sil.width
## 1       1   88          0.29
## 2       2  179          0.51
## 3       3  177          0.31
## 4       4  211          0.54
## 5       5  272          0.54
## 6       6  153          0.41
##   cluster size ave.sil.width
## 1       1   88          0.29
## 2       2  179          0.51
## 3       3  177          0.31
## 4       4  153          0.42
## 5       5  272          0.54
## 6       6  151          0.38
## 7       7   60          0.53

We can see that when the cluster are as also as two or three the average silhouette width are very low with 0.28 and 0.35 respectively. But as the number of clusters increase the average silhouette width increases to 0.47 when the number of clusters k = 9. But we can we can see when k = 5 we have most of the data points that have a positive silhouette score indicating that two clusters have a well matched data points and two that have some data points that are negative, meaning they can be classified in the adjacent clusters. Overall wee see that some data points are being classified to the adjacent cluster suggesting there might be some overlap between clusters which can be confirmed from the previous plot.

rand_indices_non_sensory <- compute_rand_indices(famd_non_sensory, coffee_complete$Grade, centers = 2:10, 2, distance = "gower")
rand_indices_non_sensory
##   NumClusters   RandIndex
## 1           2 0.020092127
## 2           3 0.003965216
## 3           4 0.007000317
## 4           5 0.012151546
## 5           6 0.009004665
## 6           7 0.015780379
## 7           8 0.012689104
## 8           9 0.014473793
## 9          10 0.009097114

The Rand index from the non-sensory attribute shows the data clustering between two data clustering in this case we are comparing the different number of clusters with the ground truth. We can see that most of the scores are low which show that none of the cluster are a good match with the ground truth. However when k = 8 in this case whos that it is the highest, which means it is the better match with the ground truth relative to the other clusters.

Sensory & Non-Sensory

In this stage we will be combining both the sensory and non-sensory data to create a data frame with all of the features.

# In the following we combine and conduct the same analysis
all_features <- cbind(non_sensory_data, sensory_data)

Factor Analysis on Mixed Data

As we have done before we will start by conduct a factor analysis on the mixed data to create data that is represented using the principal compinent for further clustering.

famd_all_features <- FAMD(all_features, graph = FALSE, ncp = 60)
summary(famd_all_features)
## 
## Call:
## FAMD(base = all_features, ncp = 60, graph = FALSE) 
## 
## 
## Eigenvalues
##                       Dim.1  Dim.2  Dim.3  Dim.4  Dim.5  Dim.6  Dim.7  Dim.8
## Variance              5.927  3.165  2.982  2.969  2.821  2.645  2.559  2.468
## % of var.             5.488  2.931  2.761  2.749  2.612  2.449  2.369  2.285
## Cumulative % of var.  5.488  8.418 11.179 13.928 16.540 18.989 21.359 23.644
##                       Dim.9 Dim.10 Dim.11 Dim.12 Dim.13 Dim.14 Dim.15 Dim.16
## Variance              2.379  2.329  2.259  2.175  2.118  1.959  1.880  1.824
## % of var.             2.203  2.157  2.092  2.014  1.961  1.814  1.741  1.689
## Cumulative % of var. 25.847 28.004 30.095 32.110 34.071 35.885 37.626 39.314
##                      Dim.17 Dim.18 Dim.19 Dim.20 Dim.21 Dim.22 Dim.23 Dim.24
## Variance              1.725  1.689  1.636  1.603  1.482  1.442  1.389  1.367
## % of var.             1.597  1.564  1.514  1.484  1.372  1.336  1.286  1.266
## Cumulative % of var. 40.912 42.475 43.990 45.474 46.846 48.182 49.468 50.733
##                      Dim.25 Dim.26 Dim.27 Dim.28 Dim.29 Dim.30 Dim.31 Dim.32
## Variance              1.319  1.302  1.277  1.236  1.211  1.201  1.150  1.139
## % of var.             1.221  1.205  1.182  1.144  1.121  1.112  1.065  1.054
## Cumulative % of var. 51.955 53.160 54.342 55.486 56.607 57.719 58.784 59.839
##                      Dim.33 Dim.34 Dim.35 Dim.36 Dim.37 Dim.38 Dim.39 Dim.40
## Variance              1.112  1.093  1.076  1.059  1.042  1.034  1.028  1.026
## % of var.             1.030  1.012  0.996  0.980  0.964  0.957  0.952  0.950
## Cumulative % of var. 60.869 61.881 62.877 63.857 64.822 65.779 66.731 67.681
##                      Dim.41 Dim.42 Dim.43 Dim.44 Dim.45 Dim.46 Dim.47 Dim.48
## Variance              1.021  1.018  1.013  1.009  1.006  1.005  1.002  1.001
## % of var.             0.946  0.943  0.938  0.934  0.932  0.930  0.928  0.927
## Cumulative % of var. 68.626 69.569 70.507 71.442 72.374 73.304 74.232 75.159
##                      Dim.49 Dim.50 Dim.51 Dim.52 Dim.53 Dim.54 Dim.55 Dim.56
## Variance              0.998  0.989  0.979  0.971  0.944  0.942  0.934  0.901
## % of var.             0.924  0.915  0.907  0.899  0.874  0.872  0.864  0.835
## Cumulative % of var. 76.082 76.998 77.904 78.803 79.678 80.549 81.414 82.249
##                      Dim.57 Dim.58 Dim.59 Dim.60
## Variance              0.877  0.834  0.810  0.803
## % of var.             0.812  0.772  0.750  0.743
## Cumulative % of var. 83.060 83.832 84.583 85.326
## 
## Individuals (the 10 first)
##                              Dist    Dim.1    ctr   cos2    Dim.2    ctr   cos2
## 1                        | 14.573 |  9.682  1.464  0.441 |  1.289  0.049  0.008
## 2                        | 14.089 |  8.751  1.196  0.386 |  0.509  0.008  0.001
## 3                        |  9.693 |  6.532  0.667  0.454 |  1.958  0.112  0.041
## 4                        | 13.637 |  8.208  1.052  0.362 |  1.457  0.062  0.011
## 5                        | 13.345 |  7.604  0.903  0.325 |  0.373  0.004  0.001
## 8                        | 15.868 |  9.125  1.301  0.331 |  3.765  0.415  0.056
## 9                        | 16.309 |  9.893  1.529  0.368 |  3.841  0.432  0.055
## 10                       | 13.212 |  7.000  0.766  0.281 |  0.918  0.025  0.005
## 11                       | 13.397 |  7.633  0.910  0.325 |  2.033  0.121  0.023
## 12                       | 14.104 |  6.221  0.605  0.195 |  1.485  0.065  0.011
##                             Dim.3    ctr   cos2  
## 1                        |  0.503  0.008  0.001 |
## 2                        |  1.144  0.041  0.007 |
## 3                        | -0.197  0.001  0.000 |
## 4                        |  0.327  0.003  0.001 |
## 5                        |  0.851  0.022  0.004 |
## 8                        | -2.874  0.256  0.033 |
## 9                        | -2.613  0.212  0.026 |
## 10                       |  0.994  0.031  0.006 |
## 11                       | -0.539  0.009  0.002 |
## 12                       | -1.331  0.055  0.009 |
## 
## Continuous variables (the 10 first)
##                             Dim.1    ctr   cos2    Dim.2    ctr   cos2    Dim.3
## altitude_mean_meters_new |  0.296  1.480  0.088 | -0.423  5.665  0.179 | -0.231
## Log.Category.One.Defects | -0.194  0.636  0.038 |  0.057  0.104  0.003 | -0.037
## Log.Category.Two.Defects | -0.309  1.611  0.095 | -0.096  0.293  0.009 |  0.060
## Quakers                  |  0.023  0.009  0.001 | -0.129  0.527  0.017 |  0.132
## Moisture                 | -0.331  1.844  0.109 | -0.149  0.704  0.022 |  0.314
## Log.Total.Weight         |  0.385  2.496  0.148 | -0.481  7.297  0.231 | -0.147
## Aroma                    |  0.732  9.039  0.536 |  0.017  0.010  0.000 |  0.185
## Flavor                   |  0.843 11.989  0.711 |  0.096  0.294  0.009 |  0.178
## Aftertaste               |  0.848 12.137  0.719 |  0.133  0.558  0.018 |  0.115
## Acidity                  |  0.759  9.724  0.576 |  0.011  0.004  0.000 |  0.178
##                             ctr   cos2  
## altitude_mean_meters_new  1.787  0.053 |
## Log.Category.One.Defects  0.047  0.001 |
## Log.Category.Two.Defects  0.119  0.004 |
## Quakers                   0.580  0.017 |
## Moisture                  3.315  0.099 |
## Log.Total.Weight          0.723  0.022 |
## Aroma                     1.145  0.034 |
## Flavor                    1.059  0.032 |
## Aftertaste                0.446  0.013 |
## Acidity                   1.060  0.032 |
## 
## Categories (the 10 first)
##                              Dim.1     ctr    cos2  v.test     Dim.2     ctr
## Brazil                   |   0.573   0.091   0.014   2.537 |   1.307   1.657
## Burundi                  |   0.377   0.001   0.000   0.219 |   0.165   0.001
## China                    |   0.749   0.024   0.004   1.239 |   0.780   0.090
## Colombia                 |   1.793   1.263   0.195   9.680 |  -0.710   0.694
## Costa Rica               |   0.753   0.069   0.014   2.144 |  -0.821   0.286
## Cote d?Ivoire            |  -3.044   0.024   0.008  -1.251 |   2.609   0.063
## Ecuador                  |   1.867   0.009   0.003   0.767 |   3.371   0.105
## El Salvador              |   1.412   0.095   0.020   2.480 |   0.100   0.002
## Ethiopia                 |   5.561   2.446   0.354  12.683 |   1.689   0.791
## Guatemala                |  -0.094   0.004   0.001  -0.524 |  -1.894   5.172
##                             cos2  v.test     Dim.3     ctr    cos2  v.test  
## Brazil                     0.072   7.918 |   1.643   2.952   0.114  10.257 |
## Burundi                    0.000   0.131 |  -1.643   0.056   0.005  -1.346 |
## China                      0.004   1.765 |   2.674   1.192   0.048   6.238 |
## Colombia                   0.031  -5.245 |  -2.476   9.514   0.372 -18.843 |
## Costa Rica                 0.017  -3.197 |  -0.810   0.314   0.016  -3.249 |
## Cote d?Ivoire              0.006   1.467 |  -0.778   0.006   0.001  -0.451 |
## Ecuador                    0.010   1.895 |   0.103   0.000   0.000   0.060 |
## El Salvador                0.000   0.240 |  -0.403   0.031   0.002  -0.999 |
## Ethiopia                   0.033   5.273 |  -0.902   0.254   0.009  -2.901 |
## Guatemala                  0.263 -14.368 |   0.350   0.199   0.009   2.735 |

Based on the result we can see that the variance that is explained by the first principal component is 5.44% and the second principal component explains 2.9% of the variability. The fist 60 dimensions explain 84% of the variability in the data.

scree_all <- fviz_eig(famd_all_features, addlabels = TRUE, ylim = c(0, 10), ncp = 15, main = "Scree Plot Non-Sensory Data") + geom_hline(yintercept = 1, color = "red", linetype = "dashed", size = 1)
contribution_all <- fviz_cos2(famd_non_sensory, choice = "var", axes = 1:29)
var_contribution_all  <- fviz_mca_var(famd_all_features, repel = TRUE, 
                col.var = "contrib",
                ncp = 36,
                gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

numericals_all <- fviz_famd_var(famd_all_features, "quanti.var", col.var = "contrib", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE)

habillage_all <- fviz_mca_biplot(famd_all_features, 
                                      label = "var",
                                      ncp = 54, 
                                      habillage = coffee_complete$Grade, 
                                      select.var = list(contrib =6), # Select top 5 contributing variables
                                      labelsize = 4, # Increase the size of the labels
                                      pointsize = 2)
grid.arrange(scree_all, contribution_all, var_contribution_all, habillage_all)

The scree plot shows noticeable decline after the first few dimensions and then the decline slows down. We can see that the biplot shows that clusters colored different representing different coffee greens can be clustered. When we compare this result with the the factor analysis on non-sensory data we can see that the distinct that were noticed in the factor analysis with sensory attributes are showing up.

K-Medoid Clustering Sensory & Non-Sensory

cluster_stats_all_features <- compute_cluster_stats(famd_all_features, 2, distance = "gower")

When analyzing the “elbow” method we can see that the within sum of square (wss) drops quickly after the fifth cluster. However the “elbow” seems like it is formed around seventh clusters. When looking at the silhouette method we can see the optimal number of cluster that is represented by the vertical line is around 8 number of clusters. Where as the gap statistics increases without a clear maximum allowing us to determine the optimal number of clusters. Let us utilize the usual k-medoid cluster for different values of k to see what would be the outcome.

clusters_all_features <- perform_pam_and_plot(famd_all_features, coffee_complete$Grade, centers = c(3, 4, 5, 6, 8) ,2,  distance = "gower")

When we compare the clusters with the ground truth we can see some inconsistencies in how they are being clustered. For example when k = 3 we can see that the coffee grade that is characterized as Very Good is the one that is being clustered into two different groups. The fact that there is an overlap is persistent for all of the clustered with no clear delineation being visible.

silhouette_all_features <- compute_silhouette(famd_all_features, centers = c(3, 4, 5, 6, 8), 2, distance = "gower")
##   cluster size ave.sil.width
## 1       1  320          0.28
## 2       2  452          0.48
## 3       3  308          0.42
##   cluster size ave.sil.width
## 1       1  307          0.26
## 2       2  253          0.39
## 3       3  231          0.47
## 4       4  289          0.40
##   cluster size ave.sil.width
## 1       1  175          0.12
## 2       2  237          0.41
## 3       3  223          0.37
## 4       4  232          0.44
## 5       5  213          0.33
##   cluster size ave.sil.width
## 1       1  113          0.12
## 2       2  226          0.38
## 3       3  136          0.44
## 4       4  187          0.37
## 5       5  230          0.44
## 6       6  188          0.29
##   cluster size ave.sil.width
## 1       1  105          0.26
## 2       2   69          0.19
## 3       3  167          0.29
## 4       4  151          0.43
## 5       5  171          0.40
## 6       6  128          0.40
## 7       7  148          0.31
## 8       8  141          0.28

rand_indices_all_features <- compute_rand_indices(famd_all_features, coffee_complete$Grade, centers = 2:10,2,  distance = "gower")
rand_indices_all_features
##   NumClusters    RandIndex
## 1           2 -0.007564600
## 2           3  0.024357062
## 3           4  0.005681346
## 4           5  0.026914014
## 5           6  0.031724289
## 6           7  0.026168363
## 7           8  0.023685378
## 8           9  0.020269551
## 9          10  0.024015375